{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 工具包导入&数据读取\n",
    "## 工具包导入"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "#import lightgbm as lgb\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import gc\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "%matplotlib inline "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "u'/mnt/disk0/home/zhongrunxing/jupyter_code/tianchi_safe'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据读取\n",
    "- 为了方便分析，我们读取3000万条数据进行处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '/home/zhongrunxing/jupyter_code/tianchi_safe/input/'\n",
    "#train = pd.read_csv(path + 'final_train.csv',nrows=1000000)\n",
    "#train = pd.read_csv(path + 'final_test.csv',nrows=1000000)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(79288375, 4)\n"
     ]
    }
   ],
   "source": [
    "train = pd.read_csv(path + 'final_test.csv')\n",
    "print(train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征工程 & 验证结果(1-Gram)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_data = train[['file_id','label']].drop_duplicates()\n",
    "# train_data.head()\n",
    "# train_data['label'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>file_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1458</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1474</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1667</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      file_id\n",
       "0           1\n",
       "97          2\n",
       "1458        3\n",
       "1474        4\n",
       "1667        5"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data = train[['file_id']].drop_duplicates()\n",
    "train_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 全局特征:\n",
    "- File_id (Api): count,nunique\n",
    "- File_id (Tid): count,nunique,max,min,quantile(20,40,50,60,80),std,range\n",
    "- File_id (Return Value): count,nunique,max,min,quantile(20,40,50,60,80),std,range\n",
    "- File_id (Index): count,nunique,max,min,quantile(20,40,50,60,80),std,range"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### File_id (Api): count,nunique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>file_id</th>\n",
       "      <th>api</th>\n",
       "      <th>tid</th>\n",
       "      <th>index</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>RegOpenKeyExA</td>\n",
       "      <td>2332</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>CopyFileA</td>\n",
       "      <td>2332</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>OpenSCManagerA</td>\n",
       "      <td>2332</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>CreateServiceA</td>\n",
       "      <td>2332</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>RegOpenKeyExA</td>\n",
       "      <td>2468</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   file_id             api   tid  index\n",
       "0        1   RegOpenKeyExA  2332      0\n",
       "1        1       CopyFileA  2332      1\n",
       "2        1  OpenSCManagerA  2332      2\n",
       "3        1  CreateServiceA  2332      3\n",
       "4        1   RegOpenKeyExA  2468      0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
      "is deprecated and will be removed in a future version\n",
      "  after removing the cwd from sys.path.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nunique\n"
     ]
    }
   ],
   "source": [
    "api_opt = ['count','nunique'] \n",
    "for opt in api_opt:\n",
    "    print(opt)\n",
    "    tmp = train.groupby(['file_id'])['api'].agg({'fileid_api_' + opt: opt}).reset_index() \n",
    "    train_data = pd.merge(train_data,tmp,how='left', on='file_id')  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>file_id</th>\n",
       "      <th>fileid_api_count</th>\n",
       "      <th>fileid_api_nunique</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>97</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1361</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>16</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>193</td>\n",
       "      <td>34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>803</td>\n",
       "      <td>34</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   file_id  fileid_api_count  fileid_api_nunique\n",
       "0        1                97                  15\n",
       "1        2              1361                  40\n",
       "2        3                16                   9\n",
       "3        4               193                  34\n",
       "4        5               803                  34"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### File_id (Tid): count,nunique,max,min,quantile(20,40,50,60,80),std,range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
      "is deprecated and will be removed in a future version\n",
      "  after removing the cwd from sys.path.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nunique\n",
      "max\n",
      "min\n",
      "median\n",
      "std\n"
     ]
    }
   ],
   "source": [
    "tid_opt = ['count','nunique','max','min','median','std'] \n",
    "for opt in tid_opt:\n",
    "    print(opt)\n",
    "    tmp = train.groupby(['file_id'])['tid'].agg({'fileid_tid_' + opt: opt}).reset_index() \n",
    "    train_data = pd.merge(train_data,tmp,how='left', on='file_id') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "secs = [0.2,0.4,0.6,0.8]\n",
    "for sec in secs: \n",
    "    train_data['fileid_tid_quantile_' + str(sec * 100)] = train.groupby(['file_id'])['tid'].quantile(sec).values\n",
    " \n",
    "train_data['fileid_tid_range'] = train.groupby(['file_id'])['tid'].quantile(0.975).values - train.groupby(['file_id'])['tid'].quantile(0.0125).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### File_id (Index): count,nunique,max,min,quantile(20,40,50,60,80),std,range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
      "is deprecated and will be removed in a future version\n",
      "  after removing the cwd from sys.path.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nunique\n",
      "max\n",
      "min\n",
      "median\n",
      "std\n"
     ]
    }
   ],
   "source": [
    "index_opt = ['count','nunique','max','min','median','std'] \n",
    "for opt in index_opt:\n",
    "    print(opt)\n",
    "    tmp = train.groupby(['file_id'])['index'].agg({'fileid_index_' + opt: opt}).reset_index() \n",
    "    train_data = pd.merge(train_data,tmp,how='left', on='file_id') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "secs = [0.2,0.4,0.6,0.8]\n",
    "for sec in secs: \n",
    "    train_data['fileid_index_quantile_' + str(sec * 100)] = train.groupby(['file_id'])['index'].quantile(sec).values\n",
    " \n",
    "train_data['fileid_index_range'] = train.groupby(['file_id'])['index'].quantile(0.975).values - train.groupby(['file_id'])['index'].quantile(0.0125).values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 全局特征的线下验证 <font color=red>( 0.0969482)</font>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 评估指标"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def lgb_logloss(preds,data):\n",
    "    labels_ = data.get_label()\n",
    "    classes_ = np.unique(labels_) \n",
    "    preds_prob = []\n",
    "    for i in range(len(classes_)):\n",
    "        preds_prob.append(preds[i*len(labels_):(i+1) * len(labels_)])\n",
    "    preds_prob_ = np.vstack(preds_prob) \n",
    "    \n",
    "    loss = [] \n",
    "    for i in range(preds_prob_.shape[1]):  # 样本个数\n",
    "        sum_ = 0  \n",
    "        for j in range(preds_prob_.shape[0]): #类别个数\n",
    "            pred = preds_prob_[j,i] # 第i个样本预测为第j类的概率\n",
    "            if  j == labels_[i]:\n",
    "                sum_ += np.log(pred)\n",
    "            else:\n",
    "                sum_ += np.log(1 - pred) \n",
    "             \n",
    "        loss.append(sum_)  \n",
    "         \n",
    "    return 'loss is: ' ,-1 * (np.sum(loss) / preds_prob_.shape[1]),False"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 训练特征 & 标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_features = [col for col in train_data.columns if col!='label' and col!='file_id']\n",
    "train_label = 'label'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_X, test_X, train_Y, test_Y = train_test_split( train_data[train_features],train_data[train_label].values, test_size = 0.33) \n",
    "# del _\n",
    "# gc.collect()\n",
    "\n",
    "# train_ind = train_X.index\n",
    "# test_ind = test_X.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# dtrain = lgb.Dataset(train_X,train_Y) \n",
    "# dval   = lgb.Dataset(test_X,test_Y, reference = dtrain) \n",
    "\n",
    "# params = {\n",
    "#         'task':'train', \n",
    "#         'num_leaves': 255,\n",
    "#         'objective': 'multiclass',\n",
    "#         'num_class':8,\n",
    "#         #'min_data_in_leaf': 40,\n",
    "#         'min_data_in_leaf': 1,\n",
    "#         'learning_rate': 0.05,\n",
    "#         'feature_fraction': 0.85,\n",
    "#         'bagging_fraction': 0.9,\n",
    "#         'bagging_freq': 5, \n",
    "#         'max_bin':128,\n",
    "#         'num_threads': 10,\n",
    "#         'random_state':100\n",
    "#     }  \n",
    "# lgb_model_0_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss)  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 全局特征扩充\n",
    "- File_id + return_value分段：计数"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 局部组合特征(展开形式)\n",
    "### File_id + Api  \n",
    "- File_id + Api (tid): count,nunique\n",
    "- File_id + Api (return value): nunique, max, min, median, std\n",
    "- File_id + Api (index):  nunique, max, min, median, std\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### File_id + Api (tid): count,nunique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def groupby_pivot_features(data_merge, data_orig , groupby_features,col1 = None, col2 = None, opts = None):\n",
    "    for opt in opts:\n",
    "        print(opt)\n",
    "        train_split = data_orig.groupby(['file_id',col1])[col2].agg({'fileid_' + col1 + '_'+col2+'_'+ str(opt):opt}).reset_index() \n",
    "        \n",
    "        train_split_ =  pd.pivot_table(train_split, values = 'fileid_' + col1 + '_'+col2+'_'+ str(opt), index=['file_id'],columns=[col1])\n",
    "        new_cols = [ 'fileid_' + col1 + '_'+col2+  '_' + opt + '_' + str(col) for col in train_split_.columns]\n",
    "        \n",
    "        groupby_features.append(new_cols)\n",
    "        train_split_.columns = new_cols \n",
    "\n",
    "        train_split_.reset_index(inplace = True)\n",
    "        \n",
    "        data_merge = pd.merge(data_merge,train_split_,how='left', on='file_id') \n",
    "    return data_merge,groupby_features \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
      "is deprecated and will be removed in a future version\n",
      "  after removing the cwd from sys.path.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nunique\n"
     ]
    }
   ],
   "source": [
    "groupby_features = []\n",
    "api_opts = ['count', 'nunique']\n",
    "train_data_,groupby_features = groupby_pivot_features(train_data, train, groupby_features, col1 = 'api', col2 = 'tid', opts = api_opts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### File_id + Api (return value): nunique, max, min, median, std"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# api_opts = ['nunique','max','min','median','std']\n",
    "# train_data_,groupby_features = groupby_pivot_features(train_data_, train, groupby_features, col1 = 'api', col2 = 'return_value', opts = api_opts) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "####  File_id + Api(index): nunique, max, min, median, std"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nunique\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
      "is deprecated and will be removed in a future version\n",
      "  after removing the cwd from sys.path.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max\n",
      "min\n",
      "median\n",
      "std\n"
     ]
    }
   ],
   "source": [
    "api_opts = ['nunique','max','min','median','std']\n",
    "train_data_,groupby_features = groupby_pivot_features(train_data_, train, groupby_features, col1 = 'api', col2 = 'index', opts = api_opts) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>file_id</th>\n",
       "      <th>fileid_api_count</th>\n",
       "      <th>fileid_api_nunique</th>\n",
       "      <th>fileid_tid_count</th>\n",
       "      <th>fileid_tid_nunique</th>\n",
       "      <th>fileid_tid_max</th>\n",
       "      <th>fileid_tid_min</th>\n",
       "      <th>fileid_tid_median</th>\n",
       "      <th>fileid_tid_std</th>\n",
       "      <th>fileid_tid_quantile_20.0</th>\n",
       "      <th>...</th>\n",
       "      <th>fileid_api_index_std_recv</th>\n",
       "      <th>fileid_api_index_std_recvfrom</th>\n",
       "      <th>fileid_api_index_std_select</th>\n",
       "      <th>fileid_api_index_std_send</th>\n",
       "      <th>fileid_api_index_std_sendto</th>\n",
       "      <th>fileid_api_index_std_setsockopt</th>\n",
       "      <th>fileid_api_index_std_shutdown</th>\n",
       "      <th>fileid_api_index_std_socket</th>\n",
       "      <th>fileid_api_index_std_system</th>\n",
       "      <th>fileid_api_index_std_timeGetTime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>97</td>\n",
       "      <td>15</td>\n",
       "      <td>97</td>\n",
       "      <td>4</td>\n",
       "      <td>2568</td>\n",
       "      <td>2332</td>\n",
       "      <td>2544</td>\n",
       "      <td>57.218548</td>\n",
       "      <td>2468.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1361</td>\n",
       "      <td>40</td>\n",
       "      <td>1361</td>\n",
       "      <td>7</td>\n",
       "      <td>2748</td>\n",
       "      <td>2472</td>\n",
       "      <td>2524</td>\n",
       "      <td>104.399149</td>\n",
       "      <td>2472.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>16</td>\n",
       "      <td>9</td>\n",
       "      <td>16</td>\n",
       "      <td>1</td>\n",
       "      <td>2344</td>\n",
       "      <td>2344</td>\n",
       "      <td>2344</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2344.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>193</td>\n",
       "      <td>34</td>\n",
       "      <td>193</td>\n",
       "      <td>3</td>\n",
       "      <td>2584</td>\n",
       "      <td>2452</td>\n",
       "      <td>2452</td>\n",
       "      <td>50.951508</td>\n",
       "      <td>2452.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>803</td>\n",
       "      <td>34</td>\n",
       "      <td>803</td>\n",
       "      <td>3</td>\n",
       "      <td>2780</td>\n",
       "      <td>2332</td>\n",
       "      <td>2376</td>\n",
       "      <td>201.826813</td>\n",
       "      <td>2332.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 2103 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   file_id  fileid_api_count  fileid_api_nunique  fileid_tid_count  \\\n",
       "0        1                97                  15                97   \n",
       "1        2              1361                  40              1361   \n",
       "2        3                16                   9                16   \n",
       "3        4               193                  34               193   \n",
       "4        5               803                  34               803   \n",
       "\n",
       "   fileid_tid_nunique  fileid_tid_max  fileid_tid_min  fileid_tid_median  \\\n",
       "0                   4            2568            2332               2544   \n",
       "1                   7            2748            2472               2524   \n",
       "2                   1            2344            2344               2344   \n",
       "3                   3            2584            2452               2452   \n",
       "4                   3            2780            2332               2376   \n",
       "\n",
       "   fileid_tid_std  fileid_tid_quantile_20.0                ...                 \\\n",
       "0       57.218548                    2468.0                ...                  \n",
       "1      104.399149                    2472.0                ...                  \n",
       "2        0.000000                    2344.0                ...                  \n",
       "3       50.951508                    2452.0                ...                  \n",
       "4      201.826813                    2332.0                ...                  \n",
       "\n",
       "   fileid_api_index_std_recv  fileid_api_index_std_recvfrom  \\\n",
       "0                        NaN                            NaN   \n",
       "1                        NaN                            NaN   \n",
       "2                        NaN                            NaN   \n",
       "3                        NaN                            NaN   \n",
       "4                        NaN                            NaN   \n",
       "\n",
       "   fileid_api_index_std_select  fileid_api_index_std_send  \\\n",
       "0                          NaN                        NaN   \n",
       "1                          NaN                        NaN   \n",
       "2                          NaN                        NaN   \n",
       "3                          NaN                        NaN   \n",
       "4                          NaN                        NaN   \n",
       "\n",
       "   fileid_api_index_std_sendto  fileid_api_index_std_setsockopt  \\\n",
       "0                          NaN                              NaN   \n",
       "1                          NaN                              NaN   \n",
       "2                          NaN                              NaN   \n",
       "3                          NaN                              NaN   \n",
       "4                          NaN                              NaN   \n",
       "\n",
       "   fileid_api_index_std_shutdown  fileid_api_index_std_socket  \\\n",
       "0                            NaN                          NaN   \n",
       "1                            NaN                          NaN   \n",
       "2                            NaN                          NaN   \n",
       "3                            NaN                          NaN   \n",
       "4                            NaN                          NaN   \n",
       "\n",
       "   fileid_api_index_std_system  fileid_api_index_std_timeGetTime  \n",
       "0                          NaN                               NaN  \n",
       "1                          NaN                               NaN  \n",
       "2                          NaN                               NaN  \n",
       "3                          NaN                               NaN  \n",
       "4                          NaN                               NaN  \n",
       "\n",
       "[5 rows x 2103 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data_.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1阶特征的线下验证(File_id + Api)（<font color=red>0.0347293</font>）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### File_id + Index  \n",
    "- File_id + Index (api): count,nunique\n",
    "- File_id + Index (return value): nunique, max, min, median, std(暂时先搁置)\n",
    "- File_id + Index (tid):  nunique, max, min, median, std(暂时先搁置)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### File_id +Tid (api): count,nunique"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### File_id + Index特征过拟合，删除\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# delcol = []\n",
    "# for i in range(2):\n",
    "#     for item in groupby_features2[i]:\n",
    "#         delcol.append(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_data_.drop(delcol,axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征补充（加入index的差值特征）\n",
    "- File_id + Api (index_diff): 'nunique','max','min','median','std'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_diff = train.groupby(['file_id','tid'])['index'].diff().fillna(-999).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['index_diff'] = train_diff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_diff = train.loc[train.index_diff!=-999] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nunique\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
      "is deprecated and will be removed in a future version\n",
      "  after removing the cwd from sys.path.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max\n",
      "min\n",
      "median\n",
      "std\n"
     ]
    }
   ],
   "source": [
    "api_opts = ['nunique','max','min','median','std']\n",
    "train_data_,groupby_features = groupby_pivot_features(train_data_, train_diff, groupby_features, col1 = 'api', col2 = 'index_diff', opts = api_opts) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 线下验证(<font color=red>0.0346954</font>)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# train_features = [col for col in train_data_.columns if col!='label' and col!='file_id']\n",
    "# train_label = 'label'\n",
    "# print(len(train_features))\n",
    "# dtrain = lgb.Dataset(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values) \n",
    "# dval   = lgb.Dataset(train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values, reference = dtrain) \n",
    "\n",
    "# params = {\n",
    "#         'task':'train', \n",
    "#         'num_leaves': 255,\n",
    "#         'objective': 'multiclass',\n",
    "#         'num_class':6,\n",
    "#         'min_data_in_leaf': 40,\n",
    "#         'learning_rate': 0.05,\n",
    "#         'feature_fraction': 0.85,\n",
    "#         'bagging_fraction': 0.9,\n",
    "#         'bagging_freq': 5, \n",
    "#         'max_bin':128,\n",
    "#         'num_threads': 64,\n",
    "#         'random_state':100\n",
    "#     }  \n",
    "# lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss)  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 删除quantile,std统计变量之后的验证(<font color=red>0.0350054</font>)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# train_features = [col for col in train_data_.columns if col!='label' and col!='file_id' and 'std' not in col and 'quantile' not in col]\n",
    "# train_label = 'label'\n",
    "# print(len(train_features))\n",
    "# dtrain = lgb.Dataset(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values) \n",
    "# dval   = lgb.Dataset(train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values, reference = dtrain) \n",
    "\n",
    "# params = {\n",
    "#         'task':'train', \n",
    "#         'num_leaves': 255,\n",
    "#         'objective': 'multiclass',\n",
    "#         'num_class':6,\n",
    "#         'min_data_in_leaf': 40,\n",
    "#         'learning_rate': 0.05,\n",
    "#         'feature_fraction': 0.85,\n",
    "#         'bagging_fraction': 0.9,\n",
    "#         'bagging_freq': 5, \n",
    "#         'max_bin':128,\n",
    "#         'num_threads': 64,\n",
    "#         'random_state':100\n",
    "#     }  \n",
    "# lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_data_.to_csv('/data/Data_JieZhang/TC_SAFE/train_val/train_data.csv',index = None) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征工程& 验证结果 2-Gram\n",
    "## 全局特征\n",
    "### File_id（Api_2）:count,nunique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['api_shift'] = train['api'].shift(-1)\n",
    "train['api_2'] = train['api'] +'_' + train['api_shift']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.drop(['api_shift'],axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "api_count = train['api_2'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
      "is deprecated and will be removed in a future version\n",
      "  after removing the cwd from sys.path.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nunique\n"
     ]
    }
   ],
   "source": [
    "api_opt = ['count','nunique'] \n",
    "for opt in api_opt:\n",
    "    print(opt)\n",
    "    tmp = train.groupby(['file_id'])['api_2'].agg({'fileid_api_2_' + opt: opt}).reset_index() \n",
    "    train_data_ = pd.merge(train_data_,tmp,how='left', on='file_id')  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 局部特征\n",
    "### File_id + tid (Api_2): count特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "api_value_counts = pd.DataFrame(api_count).reset_index()\n",
    "api_value_counts.columns = ['api_2','api_2_count']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.merge(train, api_value_counts, on ='api_2' , how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
      "is deprecated and will be removed in a future version\n",
      "  after removing the cwd from sys.path.\n"
     ]
    }
   ],
   "source": [
    "api_opts = ['count']\n",
    "groupby_features =  []\n",
    "train_data_,groupby_features = groupby_pivot_features(train_data_, train.loc[train.api_2_count>=20], groupby_features, col1 = 'api_2', col2 = 'tid', opts = api_opts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 线下验证(<font color=red> 0.0330886</font>)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### File_id + index (Api_2): max,min特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# train_features = [col for col in train_data_.columns if col!='label' and col!='file_id' and 'std' not in col and 'quantile' not in col]\n",
    "# train_label = 'label'\n",
    "\n",
    "# train_ind = train_X.index\n",
    "# test_ind = test_X.index\n",
    "\n",
    "# dtrain = lgb.Dataset(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values) \n",
    "# dval   = lgb.Dataset(train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values, reference = dtrain) \n",
    "\n",
    "# params = {\n",
    "#         'task':'train', \n",
    "#         'num_leaves': 255,\n",
    "#         'objective': 'multiclass',\n",
    "#         'num_class':8,\n",
    "#         'min_data_in_leaf': 10,\n",
    "#         #'min_data_in_leaf': 1,\n",
    "#         'learning_rate': 0.05,\n",
    "#         'feature_fraction': 0.85,\n",
    "#         'bagging_fraction': 0.9,\n",
    "#         'bagging_freq': 5, \n",
    "#         'max_bin':128,\n",
    "#         'num_threads': 64,\n",
    "#         'random_state':100\n",
    "#     }  \n",
    "# lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fea_imp = pd.DataFrame({'feature':train_features, 'imp':lgb_model_3_order.feature_importance()}).sort_values('imp')\n",
    "# important_features = fea_imp.loc[fea_imp.imp >=1, 'feature'].values\n",
    "# important_features = list(important_features)\n",
    "\n",
    "# important_features.append('file_id')\n",
    "# important_features.append('label')\n",
    "\n",
    "# train_data_[important_features].to_csv('../feature_final/train_data_2gram.csv',index = None)\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data_.to_csv('input/test_data_2gram.csv',index = None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 附录\n",
    "tf-idf的1Gram特征可以替换api的次数特征等，加入tf-idf有提升，提升较小"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "384px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
